In [170]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import linear_model
import numpy as np
In [171]:
dtype_dict = {'bathrooms':float, 'waterfront':int, 'sqft_above':int, 'sqft_living15':float, 'grade':int, 'yr_renovated':int, 'price':float, 'bedrooms':float, 'zipcode':str, 'long':float, 'sqft_lot15':float, 'sqft_living':float, 'floors':float, 'condition':int, 'lat':float, 'date':str, 'sqft_basement':int, 'yr_built':int, 'id':str, 'sqft_lot':int, 'view':int}
regressionDir = '/home/weenkus/workspace/Machine Learning - University of Washington/Regression/datasets/'
sales = pd.read_csv(regressionDir + 'kc_house_data.csv', dtype = dtype_dict)
train_data = pd.read_csv(regressionDir + 'kc_house_train_data.csv', dtype = dtype_dict)
test_data = pd.read_csv(regressionDir + 'kc_house_test_data.csv', dtype = dtype_dict)
#sales = sales.sort_values(['sqft_living','price'])
In [172]:
sales.head()
Out[172]:
(first five rows of the sales DataFrame; table output not preserved)
In [173]:
def get_numpy_data(data_frame, features, output):
    data_frame['constant'] = 1 # add a constant column for the intercept
    # prepend 'constant' to the features list
    features = ['constant'] + features
    # select the columns of data_frame given by the 'features' list
    features_frame = data_frame[features]
    # convert the selected columns into a 2D numpy array
    features_matrix = np.array(features_frame)
    # pull out the target column as a 1D numpy array
    output_array = np.array(data_frame[output])
    return (features_matrix, output_array)
In [174]:
def predict_output(feature_matrix, weights):
    # the dot product of the feature matrix with the weight vector gives the predictions
    predictions = np.dot(feature_matrix, weights)
    return predictions
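Later cells compare models by residual sum of squares; a small helper (an added convenience sketch, not part of the original assignment code) keeps that expression in one place:

# Added helper: residual sum of squares of a weight vector on a dataset.
def get_rss(feature_matrix, weights, output):
    residuals = predict_output(feature_matrix, weights) - output
    return np.square(residuals).sum()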
In [175]:
def feature_derivative_ridge(errors, feature, weight, l2_penalty, feature_is_constant):
    # derivative = 2*SUM[ errors*feature ] + 2*l2_penalty*weight,
    # with the penalty term dropped for the constant feature
    if feature_is_constant:
        # the intercept is not regularized
        derivative = 2 * np.dot(errors, feature)
    else:
        derivative = 2 * np.dot(errors, feature) + 2 * l2_penalty * weight
    return derivative
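For reference (a standard statement of the ridge objective, added here rather than taken from the original notebook), the cost being minimized is

$$C(w) = \sum_j \big(\hat{y}_j - y_j\big)^2 + \lambda \sum_{i \ge 1} w_i^2$$

so with errors defined as predictions minus output, the partial derivative with respect to a non-constant weight $w_i$ is $2\sum_j \mathrm{errors}_j\,x_{j,i} + 2\lambda w_i$, and the penalty term is dropped for the constant ($i = 0$), which is exactly the branch structure in feature_derivative_ridge above.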
In [176]:
# Testing the above functions
(example_features, example_output) = get_numpy_data(sales, ['sqft_living'], 'price')
my_weights = np.array([1., 10.])
test_predictions = predict_output(example_features, my_weights)
errors = test_predictions - example_output # prediction errors
# next two lines should print the same values
print (feature_derivative_ridge(errors, example_features[:,1], my_weights[1], 1, False))
print (np.sum(errors*example_features[:,1])*2+20.)
print ('')
# next two lines should print the same values
print (feature_derivative_ridge(errors, example_features[:,0], my_weights[0], 1, True))
print (np.sum(errors)*2.)
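As an extra sanity check (an added sketch; ridge_cost and eps below are illustrative helpers, not assignment code), the analytic derivative can also be compared against a central finite-difference approximation of the ridge cost:

# Added sketch: finite-difference check of the derivative for weights[1].
def ridge_cost(weights, feature_matrix, output, l2_penalty):
    errs = predict_output(feature_matrix, weights) - output
    return np.dot(errs, errs) + l2_penalty * np.sum(weights[1:] ** 2)

eps = 1e-4
w_plus = my_weights.copy()
w_plus[1] += eps
w_minus = my_weights.copy()
w_minus[1] -= eps
numeric = (ridge_cost(w_plus, example_features, example_output, 1) -
           ridge_cost(w_minus, example_features, example_output, 1)) / (2 * eps)
analytic = feature_derivative_ridge(errors, example_features[:,1], my_weights[1], 1, False)
print(numeric, analytic)  # the two should agree to several significant digits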
In [177]:
def ridge_regression_gradient_descent(feature_matrix, output, initial_weights, step_size, l2_penalty, max_iterations=100):
    weights = np.array(initial_weights) # make sure it's a numpy array
    # iterate until the maximum number of iterations is reached
    for j in range(max_iterations):
        # compute the predictions using the predict_output() function
        predictions = predict_output(feature_matrix, weights)
        # compute the errors as predictions - output
        errors = predictions - output
        for i in range(len(weights)): # loop over each weight
            # feature_matrix[:,i] is the feature column associated with weights[i]
            # (remember: when i = 0 we are computing the derivative of the constant,
            # which is not regularized)
            if i == 0:
                derivative = feature_derivative_ridge(errors, feature_matrix[:,i], weights[i], l2_penalty, True)
            else:
                derivative = feature_derivative_ridge(errors, feature_matrix[:,i], weights[i], l2_penalty, False)
            # subtract the step size times the derivative from the current weight
            weights[i] = weights[i] - (step_size * derivative)
    return weights
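Ridge regression also has a closed-form solution, which gives an independent cross-check (an added sketch, not part of the assignment; the helper name is illustrative). With the intercept left unpenalized, the normal equations are (X^T X + l2_penalty * D) w = X^T y, where D is an identity matrix whose first diagonal entry is zeroed; gradient descent should approach this solution as max_iterations grows:

# Added sketch: closed-form ridge solution with an unpenalized intercept.
def ridge_closed_form(feature_matrix, output, l2_penalty):
    D = np.eye(feature_matrix.shape[1])
    D[0, 0] = 0.0  # do not regularize the constant term
    XtX = feature_matrix.T.dot(feature_matrix)
    return np.linalg.solve(XtX + l2_penalty * D, feature_matrix.T.dot(output))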
In [178]:
simple_features = ['sqft_living']
my_output = 'price'
(simple_feature_matrix, output) = get_numpy_data(train_data, simple_features, my_output)
(simple_test_feature_matrix, test_output) = get_numpy_data(test_data, simple_features, my_output)
In [206]:
step_size = 1e-12
max_iterations = 1000
initial_weights = [0. , 0.]
l2_penalty = 0.0
simple_weights_0_penalty = ridge_regression_gradient_descent(simple_feature_matrix, output, initial_weights,
                                                             step_size, l2_penalty, max_iterations)
In [180]:
print (simple_weights_0_penalty)
In [207]:
l2_penalty = 1e11
simple_weights_high_penalty = ridge_regression_gradient_descent(simple_feature_matrix, output, initial_weights,
                                                                step_size, l2_penalty, max_iterations)
print (simple_weights_high_penalty)
In [208]:
%matplotlib inline
# black dots: data; blue line: unregularized fit; red line: high-penalty fit (flatter slope)
plt.plot(simple_feature_matrix, output, 'k.',
         simple_feature_matrix, predict_output(simple_feature_matrix, simple_weights_0_penalty), 'b-',
         simple_feature_matrix, predict_output(simple_feature_matrix, simple_weights_high_penalty), 'r-')
In [209]:
RSS_Low_p = np.square(predict_output(simple_test_feature_matrix, simple_weights_0_penalty) - test_output).sum()
RSS_High_p = np.square(predict_output(simple_test_feature_matrix, simple_weights_high_penalty) - test_output).sum()
RSS_0_weights = np.square(predict_output(simple_test_feature_matrix, initial_weights) - test_output).sum()
print (RSS_Low_p)
print (RSS_High_p)
print (RSS_0_weights)
In [218]:
model_features = ['sqft_living', 'sqft_living15']
my_output = 'price'
(feature_matrix, output) = get_numpy_data(train_data, model_features, my_output)
(test_feature_matrix, test_output) = get_numpy_data(test_data, model_features, my_output)
In [219]:
step_size = 1e-12
max_iterations = 1000
initial_weights = [0. , 0., 0.]
l2_penalty = 0.0
multiple_weights_0_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights,
                                                               step_size, l2_penalty, max_iterations)
In [220]:
print (multiple_weights_0_penalty)
In [221]:
l2_penalty = 1e11
multiple_weights_high_penalty = ridge_regression_gradient_descent(feature_matrix, output, initial_weights,
                                                                  step_size, l2_penalty, max_iterations)
print(multiple_weights_high_penalty)
In [224]:
RSS_Low_p = np.square(predict_output(test_feature_matrix, multiple_weights_0_penalty) - test_output).sum()
RSS_High_p = np.square(predict_output(test_feature_matrix, multiple_weights_high_penalty) - test_output).sum()
RSS_0_weights = np.square(predict_output(test_feature_matrix, initial_weights) - test_output).sum()
print (RSS_Low_p)
print (RSS_High_p)
print (RSS_0_weights)
In [244]:
print(predict_output(test_feature_matrix, multiple_weights_0_penalty)-test_output)
In [246]:
print(predict_output(test_feature_matrix, multiple_weights_high_penalty) - test_output)
# Comparing absolute errors, the first house is predicted better by the high-penalty model
# (roughly a 40k error, vs. roughly a 70k error for the low-penalty model)
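To make that comparison explicit (an added snippet; index 0 is assumed to be the "first house" of the test set):

# Added for illustration: absolute error on the first test house under each model.
errors_low = predict_output(test_feature_matrix, multiple_weights_0_penalty) - test_output
errors_high = predict_output(test_feature_matrix, multiple_weights_high_penalty) - test_output
print(abs(errors_low[0]), abs(errors_high[0]))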